Main Road:
The main road variable is yes/no based on the street of the home. We will replace this with a dummy variable.
## mainroad n
## 1 no 50
## 2 yes 336
\(~\)
Use 250 words or less to summarize your problem, methodology, and major outcomes.
[insert text here]
\(~\)
house prices, linear models, regression, home investment, real estate
\(~\)
Describe the background and motivation of your problem.
[insert text here]
\(~\)
Discuss how other researchers have addressed similar problems, what their achievements are, and what the advantage and drawbacks of each reviewed approach are. Explain how your investigation is similar or different to the state-of-the- art. Please cite the relevant papers where appropriate.
[insert text here]
\(~\)
Discuss the key aspects of your problem, data set and regression model(s). Given that you are working on real-world data, explain at a high-level your exploratory data analysis, how you prepared the data for regression modeling, your process for building regression models, and your model selection.
[insert text here]
\(~\)
Describe the specifics of what you did (data exploration, data preparation, model building, model selection, model evaluation, etc.), and what you found out (statistical analyses, interpretation and discussion of the results, etc.). Conclude your findings, limitations, and suggest areas for future work.
[insert text here]
\(~\)
[insert text here]
\(~\)
[insert text here]
\(~\)
[insert text here]
\(~\)
Be sure to cite all references used in the report (APA format).
H, M. Y. (2022, January 12). Housing prices dataset. Kaggle. Retrieved November 28, 2022, from https://www.kaggle.com/datasets/yasserh/housing-prices-dataset
Schmidt, A. (2018, June). Linear Regression and the normality assumption. Retrieved December 5, 2022, from https://doi.org/10.1016/j.jclinepi.2017.12.006
[insert text here]
\(~\)
# load libraries
library(tidyverse)
library(dplyr)
library(corrplot)
library(MASS)
library(dvmisc)
library(car)
library(lmtest)
library(olsrr)
library(caret)
# load data
house_prices <- read.csv("https://raw.githubusercontent.com/letisalba/Data_621/master/Final_Project/csv/Housing.csv")
# make this example reproducible
set.seed(1)
# use 70% of data set as training set and 30% as evaluation set
sample <- sample(c(TRUE, FALSE), nrow(house_prices), replace=TRUE, prob=c(0.7,0.3))
dftrain <- house_prices[sample, ]
dfeval <- house_prices[!sample, ]
head(dftrain)
# explore data
str(dftrain)
str(dfeval)
summary(dftrain)
summary(dfeval)
# distribution of the price ranges in our data
options(scipen=5)
hist(dftrain$price / 10^6, main = 'Distribution of Price', xlab = 'Price in millions')
# distribution of area
hist(dftrain$area, main = 'Distribution of Area', xlab = 'Area in Square Feet')
# looking at all the variables separately
dftrain %>% count(bedrooms)
dftrain %>% count(bathrooms)
dftrain %>% count(stories)
dftrain %>% count(parking)
dftrain %>% count(furnishingstatus)
dftrain %>% count(mainroad)
dftrain %>% count(guestroom)
dftrain %>% count(basement)
dftrain %>% count(hotwaterheating)
dftrain %>% count(airconditioning)
dftrain %>% count(prefarea)
# clean function to create dummy variables to replace our categorical variables
fun_clean_df <- function(df) {
df <- df %>%
mutate(bed2 = ifelse(bedrooms == 2,1,0)) %>%
mutate(bed3 = ifelse(bedrooms == 3,1,0)) %>%
mutate(bed4 = ifelse(bedrooms == 4,1,0)) %>%
mutate(bed5 = ifelse(bedrooms == 5,1,0)) %>%
mutate(bed6plus = ifelse(bedrooms >= 6,1,0)) %>% dplyr::select(-bedrooms) %>%
mutate(bath2 = ifelse(bathrooms == 2,1,0)) %>%
mutate(bath3 = ifelse(bathrooms == 3,1,0)) %>%
mutate(bath4plus = ifelse(bathrooms >= 4,1,0)) %>% dplyr::select(-bathrooms) %>%
mutate(floor2 = ifelse(stories == 2,1,0)) %>%
mutate(floor3 = ifelse(stories == 3,1,0)) %>%
mutate(floor4plus = ifelse(stories >= 4,1,0)) %>% dplyr::select(-stories) %>%
mutate(car1 = ifelse(parking == 1,1,0)) %>%
mutate(car2 = ifelse(parking == 2,1,0)) %>%
mutate(car3plus = ifelse(parking >= 3,1,0)) %>% dplyr::select(-parking) %>%
mutate(semifurnished = ifelse(furnishingstatus == 'semi-furnished',1,0)) %>%
mutate(furnished = ifelse(furnishingstatus == 'furnished',1,0)) %>% dplyr::select(-furnishingstatus) %>%
mutate(mainroad = ifelse(mainroad == 'yes',1,0)) %>%
mutate(guestroom = ifelse(guestroom == 'yes',1,0)) %>%
mutate(basement = ifelse(basement == 'yes',1,0)) %>%
mutate(hotwaterheating = ifelse(hotwaterheating == 'yes',1,0)) %>%
mutate(ac = ifelse(airconditioning == 'yes',1,0)) %>% dplyr::select(-airconditioning) %>%
mutate(neighborhood = ifelse(prefarea == 'yes',1,0)) %>% dplyr::select(-prefarea)
}
dftrain_clean <- fun_clean_df(dftrain)
dfeval_clean <- fun_clean_df(dfeval)
# Compare quantitative variables
pairs(dftrain[, c('price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking')])
corrplot(cor(dftrain[, c('price', 'area', 'bedrooms', 'bathrooms', 'stories', 'parking')], use = 'complete.obs'), tl.cex = 0.5)
# Compare nominal categorical variables to price
par(mfrow=c(3, 3))
boxplot(price ~ mainroad, data=dftrain)
boxplot(price ~ guestroom, data=dftrain)
boxplot(price ~ basement, data=dftrain)
boxplot(price ~ hotwaterheating, data=dftrain)
boxplot(price ~ airconditioning, data=dftrain)
boxplot(price ~ prefarea, data=dftrain)
boxplot(price ~ furnishingstatus, data=dftrain)
# Compare ordinal categorical variables
par(mfrow=c(2, 2))
boxplot(price ~ bedrooms, data=dftrain)
boxplot(price ~ bathrooms, data=dftrain)
boxplot(price ~ stories, data=dftrain)
boxplot(price ~ parking, data=dftrain)
# Draw a histogram to figure out the distribution of Sale Price
options(scipen=10000)
ggplot(dftrain_clean, aes(x = price, fill = ..count..)) +
geom_histogram() +
ggtitle("Figure 1 Histogram of Price") +
ylab("Count of houses") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.9))
# corrplot
cor_res <- cor(dftrain_clean)
corrplot(cor_res,
type = "lower",
order = "original",
tl.col = "black",
tl.srt = 50,
tl.cex = 1)
# start of model building
#partition data
#set.seed(10000)
#train.index <- sample(c(1:dim(dftrain_clean)[1]), dim(dftrain_clean)[1]*0.8)
#model_lin_train = dftrain_clean[train.index,]
#model_lin_valid <- dftrain_clean[-train.index,]
model_lin_train <- dftrain_clean
# mlr 1
lm_mod1 <- lm(price ~., data = model_lin_train)
aic_lm_mod1 = AIC(lm_mod1)
summary(lm_mod1)
# mlr 2
lm_mod2 <- stepAIC(lm_mod1, trace = F)
aic_lm_mod2 = AIC(lm_mod2)
summary(lm_mod2)
# mlr 3
# reduce collinearity, and remove low values
lm_mod3 <- lm(price ~ area + guestroom + basement + bath2 + bath3 +
bath4plus + floor2 + floor3 + floor4plus + car1 + car2 +
car3plus + semifurnished + furnished + ac + neighborhood
- car3plus - bed2,
data = model_lin_train)
summary(lm_mod3)
# model selection
# Define function to calculate mean squared error
calc_mse <- function(lmod) {
return(mean((summary(lmod))$residuals ^ 2))
}
# Define function to aid in model analysis
ModelAnalysis <- function(lmod) {
# Plot residuals
print('--------------------------------------------------')
print(lmod$call)
par(mfrow=c(2,2))
plot(lmod)
print('')
# Shapiro test to determine normality of residuals
# Null hypothesis: the residuals are normal.
# If the p-value is small, reject the null, i.e., consider the residuals *not* normally distributed.
if (length(lmod$fitted.values) > 3 & length(lmod$fitted.values) < 5000) {
st <- shapiro.test(lmod$residuals)
if (st$p.value <= 0.05) {
print(paste0("Shapiro test for normality: The p-value of ", st$p.value, " is <= 0.05, so reject the null; i.e., the residuals are NOT NORMAL"))
} else {
print(paste0("Shapiro test for normality: The p-value of ", st$p.value, " is > 0.05, so do not reject the null; i.e., the residuals are NORMAL"))
}
print('')
} else {
print("Shapiro test for normality of residuals cannot be performed; sample length must be between 3 and 5000.")
}
# Breusch-Pagan test to determine homoschedasticity of residuals
# Null hypothesis: the residuals are homoschedastic.
# If the p-value is small, reject the null, i.e., consider the residuals heteroschedastic.
bp <- bptest(lmod)
if (bp$p.value > 0.05 & bp$statistic < 10) {
print(paste0("Breusch-Pagan test for homoschedasticity: The p-value of ", bp$p.value, " is > 0.05 and the test statistic of ", bp$statistic,
" is < 10, so don't reject the null; i.e., the residuals are HOMOSCHEDASTIC."))
} else if (bp$p.value <= 0.05) {
print(paste0("Breusch-Pagan test for homoschedasticity: The p-value of ", bp$p.value, " is <= 0.05 and the test statistic is ", bp$statistic,
", so reject the null; i.e., the residuals are HETEROSCHEDASTIC."))
} else {
print(paste0("Breusch-Pagan test for homoschedasticity: The p-value of ", bp$p.value, " and test statistic of ", bp$statistic,
" are inconclusive, so homoschedasticity can't be determined using this test. But since the p-value is > 0.05, ",
"it is reasonable to conclude that the residuals are HOMOSCHEDASTIC."))
}
print('')
# Visually look for colinearity - dont do this for large models
#pairs(model.matrix(lmod))
# Variance inflation factor (VIF)
print('Variance inflation factor (VIF)')
print('<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated')
print(sort(vif(lmod), decreasing=T))
print('')
# Standardized residual plots (look for points outside of 2 or 3 stdev)
p <- length(summary(lmod)$coeff[,1] - 1) # number of model parameters
stanres <- rstandard(lmod)
for (i in seq(1, ceiling(p / 4))) {
par(mfrow=c(2,2))
starti <- ((i - 1) * 4) + 1
for (j in seq(starti, starti + 3)) {
if (j + 1 <= ncol(model.matrix(lmod))) {
# Skip these plots since we're pretty sure that a linear model isn't valid here
#plot(model.matrix(lmod)[, j + 1], stanres, xlab=colnames(model.matrix(lmod))[j + 1], ylab='Standardized residuals')
#abline(h=c(-2, 2), lt=3, col='blue')
#abline(h=c(-3, 3), lt=2, col='red')
}
}
}
# Model scores
print('Model scores:')
print(paste0(' adjusted R-squared: ', round(summary(lmod)$adj.r.squared, 3)))
print(paste0(' AIC: ', round(AIC(lmod, k=2), 3)))
print(paste0(' BIC: ', round(BIC(lmod), 3)))
print(paste0(' Mallow\'s Cp: ', round(ols_mallows_cp(lmod, fullmodel=lmod), 3)))
print(paste0(' mean squared error: ', round(calc_mse(lmod), 3)))
print('')
# Find leverage point cutoff
n <- length(lmod$residuals)
cutoff <- 2 * (p + 1) / n
print(paste0('Leverage point cutoff: ', cutoff))
print('')
# Show points of influence
print('First 10 points of influence:')
poi <- lm.influence(lmod)$hat
len_poi <- length(poi)
ct <- 0
for (i in seq(1, length(poi))) {
if (poi[i] > cutoff) {
ct <- ct + 1
print(paste0(' case #', i, ': ', round(poi[i], 3)))
}
if (ct > 10) {
break
}
}
print('')
}
# Analysis on the two step-reduced models
ModelAnalysis(lm_mod2)
ModelAnalysis(lm_mod3)
# Box-Cox transform on price
bc1 <- powerTransform(price ~ ., data=model_lin_train)
bc1
# Box-Cox result suggests doing a log transform on price
lm_mod4 <- lm(log(price) ~ ., data=model_lin_train)
summary(lm_mod4)
lm_mod5 <- stepAIC(lm_mod4, trace=F)
summary(lm_mod5)
ModelAnalysis(lm_mod5)
# Investigate top outliers
model_lin_train[c(2, 5, 9, 25, 49, 62, 77, 103, 110, 136, 210),]
# Log transform on price with outliers removed
lm_mod6 <- lm(formula(lm_mod5),
data = model_lin_train[c(-2, -5, -9, -25, -49, -62, -77, -103, -110, -136, -210),])
summary(lm_mod6)
lm_mod7 <- stepAIC(lm_mod6, trace=F)
summary(lm_mod7)
ModelAnalysis(lm_mod7)
# Huber robust linear regression
lm_mod8 <- rlm(formula(lm_mod7), data=model_lin_train)
dftmp <- data.frame(cbind(price=model_lin_train$price, huber_weight=lm_mod8$w))
dftmp <- dftmp %>% arrange(huber_weight, ascending=F)
hist(dftmp$huber_weight, xlab='Huber weight', main='Histogram of Huber Weights')
# New linear model using Huber weights
lm_mod9 <- lm(formula(lm_mod7), weights=lm_mod8$w, data=model_lin_train)
summary(lm_mod9)
ModelAnalysis(lm_mod9)
# Remove most significant outliers
lm_mod10 <- lm(formula(lm_mod7), weights=lm_mod8$w[c(-53, -68, -87, -90, -103)],
data=model_lin_train[c(-53, -68, -87, -90, -103),])
summary(lm_mod10)
ModelAnalysis(lm_mod10)
# Five-fold cross validation
set.seed(777)
tc <- trainControl(method = "cv", number = 5)
lmcv <- train(formula(lm_mod8), weights=lm_mod8$w, data=model_lin_train, method="lm", trControl=tc)
print(lmcv)
summary(lmcv)
# Huber robust linear regression
lm_valid1 <- lm(formula(lm_mod7), data=dfeval_clean)
# New linear model using Huber weights
lm_valid2 <- lm(formula(lm_mod7), weights=lm_valid1$w, data=dfeval_clean)
summary(lm_valid2)
ModelAnalysis(lm_valid2)
# Model comparison
hdr <- c('#', 'Train/Validation', 'Linear/Robust', 'Full/Step-reduced', 'Log Transform',
'Outliers Removed', 'Huber-Weighted', 'Adj R-Sqr')
f1 <- c(seq(1:11))
f2 <- c(rep('Train', 10), 'Validation')
f3 <- c(rep('Linear', 7), 'Robust', rep('Linear', 3))
f4 <- c('Full', 'Step', 'Step', 'Full', rep('Step', 7))
f5 <- c(rep('', 3), rep('Yes', 8))
f6 <- c(rep('', 5), 'Yes', 'Yes', '', '', 'Yes', '')
f7 <- c(rep('', 8), rep('Yes', 3))
f8 <- round(c(
summary(lm_mod1)$adj.r.squared,
summary(lm_mod2)$adj.r.squared,
summary(lm_mod3)$adj.r.squared,
summary(lm_mod4)$adj.r.squared,
summary(lm_mod5)$adj.r.squared,
summary(lm_mod6)$adj.r.squared,
summary(lm_mod7)$adj.r.squared,
NA,
summary(lm_mod9)$adj.r.squared,
summary(lm_mod10)$adj.r.squared,
summary(lm_valid2)$adj.r.squared), 3)
dfresult <- data.frame(f1, f2, f3, f4, f5, f6, f7, f8)
colnames(dfresult) <- hdr
dfresult
START OF OUR CODING HERE! WILL NOT SHOW FOR PDF VERSION
These are the libraries used to explore, prepare, analyze and build our models
library(tidyverse)
library(dplyr)
library(corrplot)
library(MASS)
library(dvmisc)
library(car)
library(lmtest)
library(olsrr)
library(caret)
We have included the original data sets in our GitHub account and read from this location. Since our data set doesn’t come with a training and evaluation data sets we will be splitting our data using the 70% - 30% split. Below we are showing the training data set:
## price area bedrooms bathrooms stories mainroad guestroom basement
## 1 13300000 7420 4 2 3 yes no no
## 2 12250000 8960 4 4 4 yes no no
## 3 12250000 9960 3 2 2 yes no yes
## 5 11410000 7420 4 1 2 yes yes yes
## 8 10150000 16200 5 3 2 yes no no
## 9 9870000 8100 4 1 2 yes yes yes
## hotwaterheating airconditioning parking prefarea furnishingstatus
## 1 no yes 2 yes furnished
## 2 no yes 3 no furnished
## 3 no no 2 yes semi-furnished
## 5 no yes 2 no furnished
## 8 no no 0 no unfurnished
## 9 no yes 2 yes furnished
\(~\)
Based on this our training data includes 386 records and 13 variables whereas the evaluation data includes 159 records and 13 variables.
Training:
## 'data.frame': 386 obs. of 13 variables:
## $ price : int 13300000 12250000 12250000 11410000 10150000 9870000 9800000 9800000 9681000 9310000 ...
## $ area : int 7420 8960 9960 7420 16200 8100 5750 13200 6000 6550 ...
## $ bedrooms : int 4 4 3 4 5 4 3 3 4 4 ...
## $ bathrooms : int 2 4 2 1 3 1 2 1 3 2 ...
## $ stories : int 3 4 2 2 2 2 4 2 2 2 ...
## $ mainroad : chr "yes" "yes" "yes" "yes" ...
## $ guestroom : chr "no" "no" "no" "yes" ...
## $ basement : chr "no" "no" "yes" "yes" ...
## $ hotwaterheating : chr "no" "no" "no" "no" ...
## $ airconditioning : chr "yes" "yes" "no" "yes" ...
## $ parking : int 2 3 2 2 0 2 1 2 2 1 ...
## $ prefarea : chr "yes" "no" "yes" "no" ...
## $ furnishingstatus: chr "furnished" "furnished" "semi-furnished" "furnished" ...
\(~\)
Evaluation:
## 'data.frame': 159 obs. of 13 variables:
## $ price : int 12215000 10850000 10150000 9240000 9100000 8960000 8855000 8750000 8400000 8120000 ...
## $ area : int 7500 7500 8580 7800 6600 8500 6420 4320 7950 6840 ...
## $ bedrooms : int 4 3 4 3 4 3 3 3 5 5 ...
## $ bathrooms : int 2 3 3 2 2 2 2 1 2 1 ...
## $ stories : int 2 1 4 2 2 4 2 2 2 2 ...
## $ mainroad : chr "yes" "yes" "yes" "yes" ...
## $ guestroom : chr "no" "no" "no" "no" ...
## $ basement : chr "yes" "yes" "no" "no" ...
## $ hotwaterheating : chr "no" "no" "no" "no" ...
## $ airconditioning : chr "yes" "yes" "yes" "no" ...
## $ parking : int 3 2 2 0 1 2 1 2 2 1 ...
## $ prefarea : chr "yes" "yes" "yes" "yes" ...
## $ furnishingstatus: chr "furnished" "semi-furnished" "semi-furnished" "semi-furnished" ...
\(~\)
Using the summary() function lets start exploring the
training and evaluation data.
Training:
## price area bedrooms bathrooms
## Min. : 1750000 Min. : 1650 Min. :1.000 Min. :1.00
## 1st Qu.: 3473750 1st Qu.: 3588 1st Qu.:2.000 1st Qu.:1.00
## Median : 4340000 Median : 4600 Median :3.000 Median :1.00
## Mean : 4763635 Mean : 5178 Mean :2.953 Mean :1.28
## 3rd Qu.: 5740000 3rd Qu.: 6360 3rd Qu.:3.000 3rd Qu.:2.00
## Max. :13300000 Max. :16200 Max. :6.000 Max. :4.00
## stories mainroad guestroom basement
## Min. :1.000 Length:386 Length:386 Length:386
## 1st Qu.:1.000 Class :character Class :character Class :character
## Median :2.000 Mode :character Mode :character Mode :character
## Mean :1.793
## 3rd Qu.:2.000
## Max. :4.000
## hotwaterheating airconditioning parking prefarea
## Length:386 Length:386 Min. :0.000 Length:386
## Class :character Class :character 1st Qu.:0.000 Class :character
## Mode :character Mode :character Median :0.000 Mode :character
## Mean :0.715
## 3rd Qu.:1.000
## Max. :3.000
## furnishingstatus
## Length:386
## Class :character
## Mode :character
##
##
##
\(~\)
Evaluation:
## price area bedrooms bathrooms
## Min. : 1767150 Min. : 1836 Min. :1.000 Min. :1.000
## 1st Qu.: 3430000 1st Qu.: 3600 1st Qu.:3.000 1st Qu.:1.000
## Median : 4270000 Median : 4500 Median :3.000 Median :1.000
## Mean : 4774240 Mean : 5083 Mean :2.994 Mean :1.302
## 3rd Qu.: 5771500 3rd Qu.: 6450 3rd Qu.:3.000 3rd Qu.:2.000
## Max. :12215000 Max. :12944 Max. :5.000 Max. :3.000
## stories mainroad guestroom basement
## Min. :1.000 Length:159 Length:159 Length:159
## 1st Qu.:1.000 Class :character Class :character Class :character
## Median :2.000 Mode :character Mode :character Mode :character
## Mean :1.836
## 3rd Qu.:2.000
## Max. :4.000
## hotwaterheating airconditioning parking prefarea
## Length:159 Length:159 Min. :0.0000 Length:159
## Class :character Class :character 1st Qu.:0.0000 Class :character
## Mode :character Mode :character Median :0.0000 Mode :character
## Mean :0.6415
## 3rd Qu.:1.0000
## Max. :3.0000
## furnishingstatus
## Length:159
## Class :character
## Mode :character
##
##
##
It is important to recognize that this dataset contains homes with prices above 1 million. It is not clear that this is a US dataset, which would indicate that this is for luxury homes and/or high value markets.
The area variable appears to be square footage of the home. We would traditionally expect that increases in area would lead to increases in price.
While we expect increases in the number of bedrooms to increase the price, we also realize that at some point there are diminishing returns that an additional bedroom doesn’t have as much of an impact. For example, increasing from one to two bedrooms should have significant increase in price, while increasing from four to five, perhaps not so much.
## bedrooms n
## 1 1 1
## 2 2 102
## 3 3 207
## 4 4 68
## 5 5 6
## 6 6 2
Based on the distribution of the number of Bedrooms, it may be best to categorize these with dummy variables; 2, 3, and 4+.
Similar to the number of bedrooms, we would expect that an increase in bathroom count would lead to increases in price. Although similarly, having more than four bathrooms is likely going to lead to smaller increases.
## bathrooms n
## 1 1 288
## 2 2 89
## 3 3 8
## 4 4 1
Based on the distribution of the number of bathrooms, it may be best to categorize these with dummy variables; 2, and 3+.
Similar to the number of bedrooms and bathrooms, it would seem to make sense to classify homes with 3 or more floors together by introducing dummy variables; 2, and 3+.
## stories n
## 1 1 169
## 2 2 161
## 3 3 23
## 4 4 33
We are assuming that the parking variable represents the size of a garage. Similar to other variable the increase in price from no garage to a one car garage would be significant, while additional cars would add some lesser value. It would initially seem to make sense to introduce dummy variables; 1, and 2+.
## parking n
## 1 0 203
## 2 1 97
## 3 2 79
## 4 3 7
The furnishing status variable is taking on three values; unfurnished, semi-furnished, and furnished. Since we would consider unfurnished as the default state, we will use dummy variables; semi-furnished and furnished.
## furnishingstatus n
## 1 furnished 103
## 2 semi-furnished 160
## 3 unfurnished 123
The main road variable is yes/no based on the street of the home. We will replace this with a dummy variable.
## mainroad n
## 1 no 50
## 2 yes 336
The guest room variable is yes/no based on the home having a guest room. It is unclear from the dataset source if this is in addition to the number of bedrooms, but we would expect houses with a guest room to have a higher price. We will replace this with a dummy variable.
## guestroom n
## 1 no 312
## 2 yes 74
The basement variable is yes/no based on the home having a basement. It is unclear if having a basement or not would lead to an increase in home price, but we will replace this with a dummy variable for analysis.
## basement n
## 1 no 249
## 2 yes 137
Based on the distribution, we assume that the hot water heating variable represents if the house has in-floor heating, rather than forced air. Based on this assumption, we assume that having this feature would lead to higher house price. The variable will be replaced with a dummy variable for analysis.
## hotwaterheating n
## 1 no 366
## 2 yes 20
The air conditioning variable indicates if the house has central air conditioning. We would expect homes with air conditioning would have a higher price than those without. The variable will be replaced with a dummy variable.
## airconditioning n
## 1 no 264
## 2 yes 122
The dataset source doesn’t specify exactly what this variable represents. We are assuming that this is a yes/no value if the house is in a preferred neighborhood. We would expect houses with a yes to be higher price than those not.
## prefarea n
## 1 no 298
## 2 yes 88
Based on our exploration, we do not have any blank values in our dataset.
We will introduce a clean function to replace our categorical variables with the dummy values. This will also ensure that our test and train datasets are processed in the same way.
Visual evaluation:
# Draw a histogram to figure out the distribution of Sale Price
options(scipen=10000)
ggplot(dftrain_clean, aes(x = price, fill = ..count..)) +
geom_histogram() +
ggtitle("Figure 1 Histogram of Price") +
ylab("Count of houses") +
xlab("Housing Price") +
theme(plot.title = element_text(hjust = 0.9))
After cleaning the dataset looking at a correlation plot will give us confirmation about our initial examination for the variables.
cor_res <- cor(dftrain_clean)
corrplot(cor_res,
type = "lower",
order = "original",
tl.col = "black",
tl.srt = 50,
tl.cex = 1)
The correlation plot generally confirms our initial expectations for the data.
\(~\)
#partition data
#set.seed(10000)
#train.index <- sample(c(1:dim(dftrain_clean)[1]), dim(dftrain_clean)[1]*0.8)
#model_lin_train = dftrain_clean[train.index,]
#model_lin_valid <- dftrain_clean[-train.index,]
model_lin_train <- dftrain_clean
\(~\)
lm_mod1 <- lm(price ~., data = model_lin_train)
aic_lm_mod1 = AIC(lm_mod1)
summary(lm_mod1)
##
## Call:
## lm(formula = price ~ ., data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2778242 -618889 -69359 502507 5058478
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1328656.61 1089042.62 1.220 0.223251
## area 259.46 29.19 8.889 < 0.0000000000000002 ***
## mainroad 392612.63 177649.27 2.210 0.027727 *
## guestroom 387128.55 154049.10 2.513 0.012404 *
## basement 340379.30 132393.80 2.571 0.010540 *
## hotwaterheating 454981.22 254142.03 1.790 0.074247 .
## bed2 -135521.62 1079765.76 -0.126 0.900189
## bed3 84970.58 1084129.57 0.078 0.937572
## bed4 294373.83 1095943.45 0.269 0.788388
## bed5 349023.44 1186154.54 0.294 0.768737
## bed6plus 822411.56 1320048.36 0.623 0.533666
## bath2 823033.31 148991.77 5.524 0.0000000634 ***
## bath3 1711486.52 412100.64 4.153 0.0000409553 ***
## bath4plus 5939390.79 1173229.90 5.062 0.0000006603 ***
## floor2 369286.09 145200.74 2.543 0.011397 *
## floor3 917915.07 262415.57 3.498 0.000527 ***
## floor4plus 1368891.57 247650.70 5.528 0.0000000623 ***
## car1 350724.78 139562.29 2.513 0.012404 *
## car2 597602.32 154287.27 3.873 0.000127 ***
## car3plus -694646.46 454023.80 -1.530 0.126896
## semifurnished 386745.44 133594.69 2.895 0.004023 **
## furnished 533608.97 151214.70 3.529 0.000471 ***
## ac 762389.97 133720.79 5.701 0.0000000247 ***
## neighborhood 666169.08 141563.01 4.706 0.0000036049 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1066000 on 362 degrees of freedom
## Multiple R-squared: 0.6949, Adjusted R-squared: 0.6755
## F-statistic: 35.85 on 23 and 362 DF, p-value: < 0.00000000000000022
\(~\)
lm_mod2 <- stepAIC(lm_mod1, trace = F)
aic_lm_mod2 = AIC(lm_mod2)
summary(lm_mod2)
##
## Call:
## lm(formula = price ~ area + mainroad + guestroom + basement +
## hotwaterheating + bed2 + bath2 + bath3 + bath4plus + floor2 +
## floor3 + floor4plus + car1 + car2 + car3plus + semifurnished +
## furnished + ac + neighborhood, data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2606156 -623517 -76743 477682 5170369
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1427060.25 221172.14 6.452 0.00000000035 ***
## area 262.55 28.72 9.142 < 0.0000000000000002 ***
## mainroad 371018.49 174144.11 2.131 0.033795 *
## guestroom 388933.37 153592.89 2.532 0.011751 *
## basement 327059.67 131426.10 2.489 0.013271 *
## hotwaterheating 440059.27 252885.91 1.740 0.082673 .
## bed2 -242263.06 151828.03 -1.596 0.111432
## bath2 871714.49 145028.40 6.011 0.00000000447 ***
## bath3 1782594.89 398633.12 4.472 0.00001036418 ***
## bath4plus 6100918.97 1164426.07 5.239 0.00000027257 ***
## floor2 427081.88 139028.59 3.072 0.002286 **
## floor3 944710.16 259836.19 3.636 0.000317 ***
## floor4plus 1394855.82 245550.02 5.681 0.00000002741 ***
## car1 354677.24 138326.44 2.564 0.010744 *
## car2 606621.41 153181.86 3.960 0.00009002748 ***
## car3plus -694809.66 452826.31 -1.534 0.125799
## semifurnished 390379.23 132434.98 2.948 0.003407 **
## furnished 540998.79 150081.46 3.605 0.000356 ***
## ac 757477.86 133121.53 5.690 0.00000002604 ***
## neighborhood 661882.68 141156.08 4.689 0.00000388102 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1064000 on 366 degrees of freedom
## Multiple R-squared: 0.6927, Adjusted R-squared: 0.6767
## F-statistic: 43.42 on 19 and 366 DF, p-value: < 0.00000000000000022
\(~\)
##
## Call:
## lm(formula = price ~ area + guestroom + basement + bath2 + bath3 +
## bath4plus + floor2 + floor3 + floor4plus + car1 + car2 +
## car3plus + semifurnished + furnished + ac + neighborhood -
## car3plus - bed2, data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2649096 -673238 -43507 477530 5001125
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1503879.37 170520.37 8.819 < 0.0000000000000002 ***
## area 270.76 27.87 9.715 < 0.0000000000000002 ***
## guestroom 408948.95 154956.82 2.639 0.008664 **
## basement 355323.88 131895.11 2.694 0.007382 **
## bath2 890196.07 145115.53 6.134 0.00000000220 ***
## bath3 1780896.41 401261.08 4.438 0.00001198413 ***
## bath4plus 5494328.88 1105325.41 4.971 0.00000102187 ***
## floor2 550003.46 123318.63 4.460 0.00001088514 ***
## floor3 1161098.75 249130.18 4.661 0.00000440677 ***
## floor4plus 1495717.83 237291.27 6.303 0.00000000083 ***
## car1 395580.85 136995.37 2.888 0.004111 **
## car2 706158.82 151485.48 4.662 0.00000438759 ***
## semifurnished 421633.75 132631.94 3.179 0.001602 **
## furnished 569389.00 150773.07 3.776 0.000185 ***
## ac 760707.49 132467.98 5.743 0.00000001947 ***
## neighborhood 698432.62 139650.80 5.001 0.00000088145 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1074000 on 370 degrees of freedom
## Multiple R-squared: 0.6831, Adjusted R-squared: 0.6703
## F-statistic: 53.18 on 15 and 370 DF, p-value: < 0.00000000000000022
\(~\)
Verify linear modeling assumptions:
## [1] "--------------------------------------------------"
## lm(formula = price ~ area + mainroad + guestroom + basement +
## hotwaterheating + bed2 + bath2 + bath3 + bath4plus + floor2 +
## floor3 + floor4plus + car1 + car2 + car3plus + semifurnished +
## furnished + ac + neighborhood, data = model_lin_train)
## [1] ""
## [1] "Shapiro test for normality: The p-value of 0.0000000000102740426604147 is <= 0.05, so reject the null; i.e., the residuals are NOT NORMAL"
## [1] ""
## [1] "Breusch-Pagan test for homoschedasticity: The p-value of 0.0000153934723145733 is <= 0.05 and the test statistic is 56.1639472213671, so reject the null; i.e., the residuals are HETEROSCHEDASTIC."
## [1] ""
## [1] "Variance inflation factor (VIF)"
## [1] "<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated"
## floor4plus floor2 bed2 furnished semifurnished
## 1.607734 1.602737 1.528509 1.502877 1.451709
## area basement ac car2 floor3
## 1.372255 1.348742 1.306490 1.302644 1.290267
## bath2 guestroom car3plus car1 neighborhood
## 1.272619 1.246735 1.245221 1.227795 1.196035
## bath4plus mainroad bath3 hotwaterheating
## 1.194896 1.166199 1.099953 1.071534
## [1] ""
## [1] "Model scores:"
## [1] " adjusted R-squared: 0.677"
## [1] " AIC: 11830.246"
## [1] " BIC: 11913.319"
## [1] " Mallow's Cp: 20"
## [1] " mean squared error: 1073150824664.42"
## [1] ""
## [1] "Leverage point cutoff: 0.10880829015544"
## [1] ""
## [1] "First 10 points of influence:"
## [1] " case #2: 1"
## [1] " case #5: 0.219"
## [1] " case #9: 0.184"
## [1] " case #25: 0.155"
## [1] " case #33: 0.209"
## [1] " case #49: 0.128"
## [1] " case #62: 0.149"
## [1] " case #103: 0.133"
## [1] " case #110: 0.15"
## [1] " case #136: 0.149"
## [1] " case #156: 0.201"
## [1] ""
## [1] "--------------------------------------------------"
## lm(formula = price ~ area + guestroom + basement + bath2 + bath3 +
## bath4plus + floor2 + floor3 + floor4plus + car1 + car2 +
## car3plus + semifurnished + furnished + ac + neighborhood -
## car3plus - bed2, data = model_lin_train)
## [1] ""
## [1] "Shapiro test for normality: The p-value of 0.0000000000133366117207675 is <= 0.05, so reject the null; i.e., the residuals are NOT NORMAL"
## [1] ""
## [1] "Breusch-Pagan test for homoschedasticity: The p-value of 0.000000924975310233364 is <= 0.05 and the test statistic is 56.6934694356472, so reject the null; i.e., the residuals are HETEROSCHEDASTIC."
## [1] ""
## [1] "Variance inflation factor (VIF)"
## [1] "<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated"
## furnished floor4plus semifurnished basement ac
## 1.487044 1.471990 1.427504 1.331772 1.268347
## area bath2 car2 guestroom floor2
## 1.267011 1.249185 1.248993 1.244114 1.236285
## car1 floor3 neighborhood bath3 bath4plus
## 1.180685 1.162893 1.147727 1.092669 1.055586
## [1] ""
## [1] "Model scores:"
## [1] " adjusted R-squared: 0.67"
## [1] " AIC: 11834.079"
## [1] " BIC: 11901.328"
## [1] " Mallow's Cp: 16"
## [1] " mean squared error: 1106558754047.73"
## [1] ""
## [1] "Leverage point cutoff: 0.0880829015544041"
## [1] ""
## [1] "First 10 points of influence:"
## [1] " case #2: 1"
## [1] " case #5: 0.215"
## [1] " case #9: 0.143"
## [1] " case #25: 0.15"
## [1] " case #62: 0.148"
## [1] " case #110: 0.146"
## [1] " case #136: 0.146"
## [1] " case #210: 0.144"
## [1] " case #232: 0.091"
## [1] " case #355: 0.15"
## [1] ""
\(~\)
Due to non-normal distribution and heteroschedasticity of residuals, try a transform. Use Box-Cox to estimate what kind of transform is appropriate.
# Box-Cox transform on price
bc1 <- powerTransform(price ~ ., data=model_lin_train)
bc1
## Estimated transformation parameter
## Y1
## 0.08222126
# Box-Cox result suggests doing a log transform on price
lm_mod4 <- lm(log(price) ~ ., data=model_lin_train)
summary(lm_mod4)
##
## Call:
## lm(formula = log(price) ~ ., data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58768 -0.12578 -0.00157 0.13060 0.60404
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.544518392 0.212111659 68.570 < 0.0000000000000002 ***
## area 0.000050911 0.000005685 8.956 < 0.0000000000000002 ***
## mainroad 0.106432736 0.034600557 3.076 0.002257 **
## guestroom 0.072860681 0.030003978 2.428 0.015653 *
## basement 0.092487862 0.025786197 3.587 0.000381 ***
## hotwaterheating 0.085236067 0.049498970 1.722 0.085928 .
## bed2 -0.017549893 0.210304814 -0.083 0.933540
## bed3 0.058477567 0.211154749 0.277 0.781983
## bed4 0.083540318 0.213455726 0.391 0.695754
## bed5 0.091411039 0.231026043 0.396 0.692578
## bed6plus 0.256317014 0.257104398 0.997 0.319461
## bath2 0.150431917 0.029018967 5.184 0.00000036205 ***
## bath3 0.302837566 0.080264398 3.773 0.000188 ***
## bath4plus 0.684383498 0.228508725 2.995 0.002933 **
## floor2 0.057366590 0.028280592 2.028 0.043243 *
## floor3 0.196068831 0.051110399 3.836 0.000147 ***
## floor4plus 0.264755508 0.048234660 5.489 0.00000007621 ***
## car1 0.071065993 0.027182396 2.614 0.009311 **
## car2 0.092977637 0.030050365 3.094 0.002128 **
## car3plus -0.109608169 0.088429727 -1.239 0.215965
## semifurnished 0.144544389 0.026020095 5.555 0.00000005383 ***
## furnished 0.136319806 0.029451924 4.629 0.00000513855 ***
## ac 0.154534113 0.026044654 5.933 0.00000000694 ***
## neighborhood 0.127050640 0.027572075 4.608 0.00000564357 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2076 on 362 degrees of freedom
## Multiple R-squared: 0.7091, Adjusted R-squared: 0.6906
## F-statistic: 38.37 on 23 and 362 DF, p-value: < 0.00000000000000022
lm_mod5 <- stepAIC(lm_mod4, trace=F)
summary(lm_mod5)
##
## Call:
## lm(formula = log(price) ~ area + mainroad + guestroom + basement +
## hotwaterheating + bed3 + bed4 + bed6plus + bath2 + bath3 +
## bath4plus + floor2 + floor3 + floor4plus + car1 + car2 +
## semifurnished + furnished + ac + neighborhood, data = model_lin_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58509 -0.12700 0.00047 0.12936 0.60034
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.535084196 0.040152865 361.994 < 0.0000000000000002 ***
## area 0.000050982 0.000005498 9.273 < 0.0000000000000002 ***
## mainroad 0.098425270 0.033966231 2.898 0.003985 **
## guestroom 0.073094605 0.029965558 2.439 0.015192 *
## basement 0.094260108 0.025696821 3.668 0.000281 ***
## hotwaterheating 0.088006202 0.049413883 1.781 0.075744 .
## bed3 0.064144656 0.028343442 2.263 0.024215 *
## bed4 0.085823272 0.039599484 2.167 0.030859 *
## bed6plus 0.263077810 0.150898238 1.743 0.082104 .
## bath2 0.153013605 0.028971974 5.281 0.00000022073 ***
## bath3 0.326495664 0.077959203 4.188 0.00003531575 ***
## bath4plus 0.586803921 0.215060901 2.729 0.006669 **
## floor2 0.066285069 0.027306452 2.427 0.015688 *
## floor3 0.208868710 0.050236039 4.158 0.00004009382 ***
## floor4plus 0.265805557 0.047723404 5.570 0.00000004958 ***
## car1 0.071743162 0.026735578 2.683 0.007619 **
## car2 0.097614195 0.029751245 3.281 0.001134 **
## semifurnished 0.142899486 0.025928567 5.511 0.00000006745 ***
## furnished 0.136023794 0.029314267 4.640 0.00000485991 ***
## ac 0.156272867 0.026009414 6.008 0.00000000454 ***
## neighborhood 0.128037921 0.027532296 4.650 0.00000463690 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2076 on 365 degrees of freedom
## Multiple R-squared: 0.7068, Adjusted R-squared: 0.6907
## F-statistic: 43.99 on 20 and 365 DF, p-value: < 0.00000000000000022
ModelAnalysis(lm_mod5)
## [1] "--------------------------------------------------"
## lm(formula = log(price) ~ area + mainroad + guestroom + basement +
## hotwaterheating + bed3 + bed4 + bed6plus + bath2 + bath3 +
## bath4plus + floor2 + floor3 + floor4plus + car1 + car2 +
## semifurnished + furnished + ac + neighborhood, data = model_lin_train)
## Warning: not plotting observations with leverage one:
## 2
## [1] ""
## [1] "Shapiro test for normality: The p-value of 0.0274288341817099 is <= 0.05, so reject the null; i.e., the residuals are NOT NORMAL"
## [1] ""
## [1] "Breusch-Pagan test for homoschedasticity: The p-value of 0.221006362177854 and test statistic of 24.5055362047153 are inconclusive, so homoschedasticity can't be determined using this test. But since the p-value is > 0.05, it is reasonable to conclude that the residuals are HOMOSCHEDASTIC."
## [1] ""
## [1] "Variance inflation factor (VIF)"
## [1] "<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated"
## bed4 bed3 floor2 floor4plus furnished
## 2.039063 1.789964 1.624256 1.595391 1.506251
## semifurnished basement bath2 area ac
## 1.461844 1.354548 1.334192 1.321166 1.310209
## car2 floor3 guestroom car1 neighborhood
## 1.290896 1.267012 1.246656 1.204937 1.195362
## mainroad bath3 hotwaterheating bath4plus bed6plus
## 1.165520 1.105178 1.074793 1.070778 1.051587
## [1] ""
## [1] "Model scores:"
## [1] " adjusted R-squared: 0.691"
## [1] " AIC: -96.006"
## [1] " BIC: -8.978"
## [1] " Mallow's Cp: 21"
## [1] " mean squared error: 0.041"
## [1] ""
## [1] "Leverage point cutoff: 0.113989637305699"
## [1] ""
## [1] "First 10 points of influence:"
## [1] " case #2: 1"
## [1] " case #5: 0.232"
## [1] " case #9: 0.193"
## [1] " case #25: 0.159"
## [1] " case #49: 0.124"
## [1] " case #62: 0.163"
## [1] " case #77: 0.515"
## [1] " case #103: 0.153"
## [1] " case #110: 0.155"
## [1] " case #136: 0.157"
## [1] " case #210: 0.165"
## [1] ""
\(~\)
Investigate outliers.
# Investigate top outliers
model_lin_train[c(2, 5, 9, 25, 49, 62, 77, 103, 110, 136, 210),]
## price area mainroad guestroom basement hotwaterheating bed2 bed3 bed4
## 2 12250000 8960 1 0 0 0 0 0 1
## 8 10150000 16200 1 0 0 0 0 0 0
## 12 9681000 6000 1 1 1 1 0 0 1
## 34 8190000 5960 1 1 1 0 0 1 0
## 67 6930000 13200 1 0 1 1 1 0 0
## 90 6440000 8580 1 0 0 0 0 0 0
## 113 6083000 4300 1 0 0 0 0 0 0
## 144 5600000 4800 0 0 1 1 0 0 0
## 154 5530000 3300 1 0 1 0 0 1 0
## 196 4970000 4410 1 0 1 0 0 0 1
## 291 4200000 2610 0 0 0 0 0 0 1
## bed5 bed6plus bath2 bath3 bath4plus floor2 floor3 floor4plus car1 car2
## 2 0 0 0 0 1 0 0 1 0 0
## 8 1 0 0 1 0 1 0 0 0 0
## 12 0 0 0 1 0 1 0 0 0 1
## 34 0 0 0 1 0 1 0 0 1 0
## 67 0 0 0 0 0 0 0 0 1 0
## 90 1 0 0 1 0 1 0 0 0 1
## 113 0 1 1 0 0 1 0 0 0 0
## 144 1 0 1 0 0 0 1 0 0 0
## 154 0 0 0 1 0 1 0 0 0 0
## 196 0 0 0 1 0 1 0 0 0 1
## 291 0 0 0 1 0 1 0 0 0 0
## car3plus semifurnished furnished ac neighborhood
## 2 1 0 1 1 0
## 8 0 0 0 0 0
## 12 0 1 0 0 0
## 34 0 0 0 0 0
## 67 0 0 1 0 0
## 90 0 0 1 0 0
## 113 0 0 1 0 0
## 144 0 0 0 0 0
## 154 0 1 0 0 0
## 196 0 1 0 0 0
## 291 0 1 0 0 0
\(~\)
Investigation of outliers reveals no obvious pattern, so we have to assume there is some other variable at play that we don’t have data for (e.g. high-end appliances, presence of a pool, property condition, etc). Well’ remove the outliers and re-run model.
# Log transform on price with outliers removed
lm_mod6 <- lm(formula(lm_mod5),
data = model_lin_train[c(-2, -5, -9, -25, -49, -62, -77, -103, -110, -136, -210),])
summary(lm_mod6)
##
## Call:
## lm(formula = formula(lm_mod5), data = model_lin_train[c(-2, -5,
## -9, -25, -49, -62, -77, -103, -110, -136, -210), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58650 -0.12611 -0.00086 0.12165 0.62001
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.552077340 0.041028854 354.679 < 0.0000000000000002 ***
## area 0.000046391 0.000005973 7.767 0.0000000000000873 ***
## mainroad 0.094132064 0.034584153 2.722 0.006812 **
## guestroom 0.078800876 0.030481181 2.585 0.010130 *
## basement 0.085255655 0.026311384 3.240 0.001307 **
## hotwaterheating 0.057759412 0.052990633 1.090 0.276456
## bed3 0.078248863 0.029177990 2.682 0.007665 **
## bed4 0.102622190 0.040833608 2.513 0.012407 *
## bed6plus 0.120709432 0.210035997 0.575 0.565853
## bath2 0.147071512 0.029194002 5.038 0.0000007515098156 ***
## bath3 -0.142253973 0.212242318 -0.670 0.503139
## bath4plus NA NA NA NA
## floor2 0.058563926 0.027551773 2.126 0.034227 *
## floor3 0.191284890 0.051435131 3.719 0.000232 ***
## floor4plus 0.260516774 0.047628306 5.470 0.0000000851899506 ***
## car1 0.075782849 0.027081148 2.798 0.005417 **
## car2 0.104147967 0.030074398 3.463 0.000599 ***
## semifurnished 0.150000199 0.026299921 5.703 0.0000000247524034 ***
## furnished 0.140301991 0.029667427 4.729 0.0000032579008657 ***
## ac 0.159133762 0.025897699 6.145 0.0000000021516006 ***
## neighborhood 0.132549803 0.027450937 4.829 0.0000020472915666 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2063 on 355 degrees of freedom
## Multiple R-squared: 0.6997, Adjusted R-squared: 0.6836
## F-statistic: 43.53 on 19 and 355 DF, p-value: < 0.00000000000000022
lm_mod7 <- stepAIC(lm_mod6, trace=F)
summary(lm_mod7)
##
## Call:
## lm(formula = log(price) ~ area + mainroad + guestroom + basement +
## bed3 + bed4 + bath2 + floor2 + floor3 + floor4plus + car1 +
## car2 + semifurnished + furnished + ac + neighborhood, data = model_lin_train[c(-2,
## -5, -9, -25, -49, -62, -77, -103, -110, -136, -210), ])
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.58873 -0.12342 0.00127 0.12844 0.67130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.552303788 0.040688236 357.654 < 0.0000000000000002 ***
## area 0.000045884 0.000005951 7.710 0.000000000000125 ***
## mainroad 0.097414780 0.034245452 2.845 0.004702 **
## guestroom 0.076896113 0.030070499 2.557 0.010964 *
## basement 0.085065566 0.026142098 3.254 0.001246 **
## bed3 0.078025035 0.028949410 2.695 0.007366 **
## bed4 0.100743111 0.040624077 2.480 0.013602 *
## bath2 0.149409677 0.029079309 5.138 0.000000457356246 ***
## floor2 0.059460729 0.027314842 2.177 0.030143 *
## floor3 0.195417664 0.051220442 3.815 0.000160 ***
## floor4plus 0.259324130 0.047494588 5.460 0.000000089151967 ***
## car1 0.081547827 0.026701058 3.054 0.002426 **
## car2 0.106210924 0.029971879 3.544 0.000447 ***
## semifurnished 0.152667330 0.026031467 5.865 0.000000010235661 ***
## furnished 0.141615566 0.029508058 4.799 0.000002343150712 ***
## ac 0.155994408 0.025661908 6.079 0.000000003102744 ***
## neighborhood 0.130735463 0.027358190 4.779 0.000002579879496 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.206 on 358 degrees of freedom
## Multiple R-squared: 0.698, Adjusted R-squared: 0.6845
## F-statistic: 51.71 on 16 and 358 DF, p-value: < 0.00000000000000022
ModelAnalysis(lm_mod7)
## [1] "--------------------------------------------------"
## lm(formula = log(price) ~ area + mainroad + guestroom + basement +
## bed3 + bed4 + bath2 + floor2 + floor3 + floor4plus + car1 +
## car2 + semifurnished + furnished + ac + neighborhood, data = model_lin_train[c(-2,
## -5, -9, -25, -49, -62, -77, -103, -110, -136, -210), ])
## [1] ""
## [1] "Shapiro test for normality: The p-value of 0.0135819482901162 is <= 0.05, so reject the null; i.e., the residuals are NOT NORMAL"
## [1] ""
## [1] "Breusch-Pagan test for homoschedasticity: The p-value of 0.0840661091923266 and test statistic of 24.2557228274724 are inconclusive, so homoschedasticity can't be determined using this test. But since the p-value is > 0.05, it is reasonable to conclude that the residuals are HOMOSCHEDASTIC."
## [1] ""
## [1] "Variance inflation factor (VIF)"
## [1] "<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated"
## bed4 bed3 floor2 floor4plus furnished
## 2.063505 1.834765 1.591989 1.555359 1.494596
## semifurnished area basement bath2 car2
## 1.454334 1.382581 1.372272 1.331000 1.282363
## floor3 ac guestroom car1 neighborhood
## 1.279918 1.271438 1.239238 1.191343 1.187511
## mainroad
## 1.156360
## [1] ""
## [1] "Model scores:"
## [1] " adjusted R-squared: 0.684"
## [1] " AIC: -101.985"
## [1] " BIC: -31.301"
## [1] " Mallow's Cp: 17"
## [1] " mean squared error: 0.041"
## [1] ""
## [1] "Leverage point cutoff: 0.096"
## [1] ""
## [1] "First 10 points of influence:"
## [1] " case #48: 0.104"
## [1] " case #62: 0.102"
## [1] " case #80: 0.099"
## [1] " case #84: 0.098"
## [1] " case #221: 0.116"
## [1] ""
\(~\)
Residuals are still not normally distributed. Use robust regression to try addressing non-normality.
# Huber robust linear regression
lm_mod8 <- rlm(formula(lm_mod7), data=model_lin_train)
dftmp <- data.frame(cbind(price=model_lin_train$price, huber_weight=lm_mod8$w))
dftmp <- dftmp %>% arrange(huber_weight, ascending=F)
hist(dftmp$huber_weight, xlab='Huber weight', main='Histogram of Huber Weights')
# New linear model using Huber weights
lm_mod9 <- lm(formula(lm_mod7), weights=lm_mod8$w, data=model_lin_train)
summary(lm_mod9)
##
## Call:
## lm(formula = formula(lm_mod7), data = model_lin_train, weights = lm_mod8$w)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.39873 -0.13163 0.00627 0.12850 0.41563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.576837195 0.036640378 397.835 < 0.0000000000000002 ***
## area 0.000049910 0.000005072 9.840 < 0.0000000000000002 ***
## mainroad 0.084162100 0.030798460 2.733 0.006585 **
## guestroom 0.066825026 0.027117946 2.464 0.014186 *
## basement 0.092910569 0.023534417 3.948 0.00009443484523 ***
## bed3 0.047228877 0.025581852 1.846 0.065666 .
## bed4 0.071650779 0.035800323 2.001 0.046081 *
## bath2 0.145767340 0.026146814 5.575 0.00000004789330 ***
## floor2 0.087224376 0.024366301 3.580 0.000390 ***
## floor3 0.228322844 0.045009601 5.073 0.00000062234166 ***
## floor4plus 0.304215596 0.042847379 7.100 0.00000000000645 ***
## car1 0.061013969 0.024105545 2.531 0.011785 *
## car2 0.093803011 0.027068547 3.465 0.000592 ***
## semifurnished 0.136384089 0.023452238 5.815 0.00000001311493 ***
## furnished 0.139402409 0.026699824 5.221 0.00000029770305 ***
## ac 0.142091507 0.023231442 6.116 0.00000000244205 ***
## neighborhood 0.127632227 0.024725726 5.162 0.00000040033935 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1835 on 369 degrees of freedom
## Multiple R-squared: 0.7219, Adjusted R-squared: 0.7098
## F-statistic: 59.86 on 16 and 369 DF, p-value: < 0.00000000000000022
ModelAnalysis(lm_mod9)
## [1] "--------------------------------------------------"
## lm(formula = formula(lm_mod7), data = model_lin_train, weights = lm_mod8$w)
## [1] ""
## [1] "Shapiro test for normality: The p-value of 0.00149841137153798 is <= 0.05, so reject the null; i.e., the residuals are NOT NORMAL"
## [1] ""
## [1] "Breusch-Pagan test for homoschedasticity: The p-value of 0.726046019909312 and test statistic of 12.2578680885363 are inconclusive, so homoschedasticity can't be determined using this test. But since the p-value is > 0.05, it is reasonable to conclude that the residuals are HOMOSCHEDASTIC."
## [1] ""
## [1] "Variance inflation factor (VIF)"
## [1] "<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated"
## bed4 bed3 floor4plus floor2 furnished
## 1.974297 1.763977 1.576573 1.555003 1.497120
## semifurnished basement bath2 area car2
## 1.454589 1.369392 1.321515 1.307867 1.268015
## ac floor3 guestroom neighborhood car1
## 1.266215 1.243657 1.227593 1.190301 1.183226
## mainroad
## 1.165607
## [1] ""
## [1] "Model scores:"
## [1] " adjusted R-squared: 0.71"
## [1] " AIC: -167.558"
## [1] " BIC: -96.353"
## [1] " Mallow's Cp: 158.404"
## [1] " mean squared error: 0.032"
## [1] ""
## [1] "Leverage point cutoff: 0.0932642487046632"
## [1] ""
## [1] "First 10 points of influence:"
## [1] " case #53: 0.103"
## [1] " case #68: 0.102"
## [1] " case #87: 0.097"
## [1] " case #90: 0.094"
## [1] " case #103: 0.122"
## [1] ""
# Remove most significant outliers
lm_mod10 <- lm(formula(lm_mod7), weights=lm_mod8$w[c(-53, -68, -87, -90, -103)],
data=model_lin_train[c(-53, -68, -87, -90, -103),])
summary(lm_mod10)
##
## Call:
## lm(formula = formula(lm_mod7), data = model_lin_train[c(-53,
## -68, -87, -90, -103), ], weights = lm_mod8$w[c(-53, -68,
## -87, -90, -103)])
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.39955 -0.13169 -0.00021 0.12864 0.41720
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.563721647 0.037189860 391.605 < 0.0000000000000002 ***
## area 0.000051366 0.000005254 9.776 < 0.0000000000000002 ***
## mainroad 0.088784687 0.031173748 2.848 0.004648 **
## guestroom 0.065491056 0.027695872 2.365 0.018571 *
## basement 0.085828585 0.024084842 3.564 0.000415 ***
## bed3 0.053680427 0.026267071 2.044 0.041709 *
## bed4 0.073893471 0.036611812 2.018 0.044294 *
## bath2 0.142063722 0.026492819 5.362 0.0000001462959 ***
## floor2 0.084921654 0.024516679 3.464 0.000596 ***
## floor3 0.203949148 0.048755612 4.183 0.0000360781643 ***
## floor4plus 0.298337827 0.044190686 6.751 0.0000000000581 ***
## car1 0.060818177 0.024306795 2.502 0.012784 *
## car2 0.094709759 0.027112575 3.493 0.000536 ***
## semifurnished 0.141179923 0.023615878 5.978 0.0000000053860 ***
## furnished 0.143142302 0.026980739 5.305 0.0000001957569 ***
## ac 0.145829668 0.023516123 6.201 0.0000000015211 ***
## neighborhood 0.126779027 0.025075230 5.056 0.0000006799222 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1835 on 364 degrees of freedom
## Multiple R-squared: 0.7225, Adjusted R-squared: 0.7103
## F-statistic: 59.24 on 16 and 364 DF, p-value: < 0.00000000000000022
ModelAnalysis(lm_mod10)
## [1] "--------------------------------------------------"
## lm(formula = formula(lm_mod7), data = model_lin_train[c(-53,
## -68, -87, -90, -103), ], weights = lm_mod8$w[c(-53, -68,
## -87, -90, -103)])
## [1] ""
## [1] "Shapiro test for normality: The p-value of 0.0016011455567499 is <= 0.05, so reject the null; i.e., the residuals are NOT NORMAL"
## [1] ""
## [1] "Breusch-Pagan test for homoschedasticity: The p-value of 0.699944941564394 and test statistic of 12.6251150042379 are inconclusive, so homoschedasticity can't be determined using this test. But since the p-value is > 0.05, it is reasonable to conclude that the residuals are HOMOSCHEDASTIC."
## [1] ""
## [1] "Variance inflation factor (VIF)"
## [1] "<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated"
## bed4 bed3 floor4plus floor2 furnished
## 2.033378 1.833734 1.627263 1.559638 1.501208
## semifurnished basement bath2 area ac
## 1.458014 1.408550 1.329111 1.312546 1.277562
## floor3 car2 guestroom neighborhood car1
## 1.270650 1.255128 1.248403 1.199533 1.189140
## mainroad
## 1.170717
## [1] ""
## [1] "Model scores:"
## [1] " adjusted R-squared: 0.71"
## [1] " AIC: -164.911"
## [1] " BIC: -93.94"
## [1] " Mallow's Cp: 157.372"
## [1] " mean squared error: 0.032"
## [1] ""
## [1] "Leverage point cutoff: 0.094488188976378"
## [1] ""
## [1] "First 10 points of influence:"
## [1] " case #87: 0.099"
## [1] ""
\(~\)
Perform five-fold cross validation to validate our results.
# Five-fold cross validation
set.seed(777)
tc <- trainControl(method = "cv", number = 5)
lmcv <- train(formula(lm_mod8), weights=lm_mod8$w, data=model_lin_train, method="lm", trControl=tc)
print(lmcv)
## Linear Regression
##
## 386 samples
## 16 predictor
##
## No pre-processing
## Resampling: Cross-Validated (5 fold)
## Summary of sample sizes: 308, 309, 310, 309, 308
## Resampling results:
##
## RMSE Rsquared MAE
## 0.221452 0.6481765 0.169662
##
## Tuning parameter 'intercept' was held constant at a value of TRUE
summary(lmcv)
##
## Call:
## lm(formula = .outcome ~ ., data = dat, weights = wts)
##
## Weighted Residuals:
## Min 1Q Median 3Q Max
## -0.39873 -0.13163 0.00627 0.12850 0.41563
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.576837195 0.036640378 397.835 < 0.0000000000000002 ***
## area 0.000049910 0.000005072 9.840 < 0.0000000000000002 ***
## mainroad 0.084162100 0.030798460 2.733 0.006585 **
## guestroom 0.066825026 0.027117946 2.464 0.014186 *
## basement 0.092910569 0.023534417 3.948 0.00009443484523 ***
## bed3 0.047228877 0.025581852 1.846 0.065666 .
## bed4 0.071650779 0.035800323 2.001 0.046081 *
## bath2 0.145767340 0.026146814 5.575 0.00000004789330 ***
## floor2 0.087224376 0.024366301 3.580 0.000390 ***
## floor3 0.228322844 0.045009601 5.073 0.00000062234166 ***
## floor4plus 0.304215596 0.042847379 7.100 0.00000000000645 ***
## car1 0.061013969 0.024105545 2.531 0.011785 *
## car2 0.093803011 0.027068547 3.465 0.000592 ***
## semifurnished 0.136384089 0.023452238 5.815 0.00000001311493 ***
## furnished 0.139402409 0.026699824 5.221 0.00000029770305 ***
## ac 0.142091507 0.023231442 6.116 0.00000000244205 ***
## neighborhood 0.127632227 0.024725726 5.162 0.00000040033935 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1835 on 369 degrees of freedom
## Multiple R-squared: 0.7219, Adjusted R-squared: 0.7098
## F-statistic: 59.86 on 16 and 369 DF, p-value: < 0.00000000000000022
\(~\)
Run selected model against validation set.
# Huber robust linear regression
lm_valid1 <- lm(formula(lm_mod7), data=dfeval_clean)
# New linear model using Huber weights
lm_valid2 <- lm(formula(lm_mod7), weights=lm_valid1$w, data=dfeval_clean)
summary(lm_valid2)
##
## Call:
## lm(formula = formula(lm_mod7), data = dfeval_clean, weights = lm_valid1$w)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.48413 -0.13892 -0.00573 0.10969 0.57910
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 14.58537068 0.06582701 221.571 < 0.0000000000000002 ***
## area 0.00004288 0.00001030 4.165 0.00005385 ***
## mainroad 0.15379687 0.05055842 3.042 0.00280 **
## guestroom 0.02885033 0.05399483 0.534 0.59396
## basement 0.11110224 0.04393761 2.529 0.01254 *
## bed3 0.04591430 0.04918705 0.933 0.35217
## bed4 0.00312121 0.06515361 0.048 0.96186
## bath2 0.20489716 0.04302680 4.762 0.00000468 ***
## floor2 0.02741720 0.04471215 0.613 0.54073
## floor3 0.16167545 0.07131483 2.267 0.02490 *
## floor4plus 0.31817524 0.09511955 3.345 0.00105 **
## car1 0.11035014 0.04947881 2.230 0.02730 *
## car2 0.16180673 0.05024519 3.220 0.00159 **
## semifurnished 0.13481715 0.04199654 3.210 0.00164 **
## furnished 0.06643684 0.04892878 1.358 0.17667
## ac 0.18693998 0.04012040 4.659 0.00000723 ***
## neighborhood 0.10219215 0.04690472 2.179 0.03100 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2147 on 142 degrees of freedom
## Multiple R-squared: 0.6985, Adjusted R-squared: 0.6645
## F-statistic: 20.56 on 16 and 142 DF, p-value: < 0.00000000000000022
ModelAnalysis(lm_valid2)
## [1] "--------------------------------------------------"
## lm(formula = formula(lm_mod7), data = dfeval_clean, weights = lm_valid1$w)
## [1] ""
## [1] "Shapiro test for normality: The p-value of 0.408160373603437 is > 0.05, so do not reject the null; i.e., the residuals are NORMAL"
## [1] ""
## [1] "Breusch-Pagan test for homoschedasticity: The p-value of 0.0365937787794498 is <= 0.05 and the test statistic is 27.4651434418931, so reject the null; i.e., the residuals are HETEROSCHEDASTIC."
## [1] ""
## [1] "Variance inflation factor (VIF)"
## [1] "<=1: not correlated, 1-5: moderately correlated, >5: strongly correlated"
## bed4 bed3 floor2 floor3 area
## 2.063947 2.025873 1.722025 1.587448 1.558627
## basement floor4plus semifurnished furnished neighborhood
## 1.493274 1.491043 1.483110 1.474260 1.428643
## car2 bath2 car1 guestroom mainroad
## 1.298417 1.277945 1.259110 1.244098 1.242820
## ac
## 1.196770
## [1] ""
## [1] "Model scores:"
## [1] " adjusted R-squared: 0.665"
## [1] " AIC: -19.984"
## [1] " BIC: 35.256"
## [1] " Mallow's Cp: 17"
## [1] " mean squared error: 0.041"
## [1] ""
## [1] "Leverage point cutoff: 0.226415094339623"
## [1] ""
## [1] "First 10 points of influence:"
## [1] " case #3: 0.236"
## [1] ""
\(~\)
# Model comparison
hdr <- c('#', 'Train/Validation', 'Linear/Robust', 'Full/Step-reduced', 'Log Transform',
'Outliers Removed', 'Huber-Weighted', 'Adj R-Sqr')
f1 <- c(seq(1:11))
f2 <- c(rep('Train', 10), 'Validation')
f3 <- c(rep('Linear', 7), 'Robust', rep('Linear', 3))
f4 <- c('Full', 'Step', 'Step', 'Full', rep('Step', 7))
f5 <- c(rep('', 3), rep('Yes', 8))
f6 <- c(rep('', 5), 'Yes', 'Yes', '', '', 'Yes', '')
f7 <- c(rep('', 8), rep('Yes', 3))
f8 <- round(c(
summary(lm_mod1)$adj.r.squared,
summary(lm_mod2)$adj.r.squared,
summary(lm_mod3)$adj.r.squared,
summary(lm_mod4)$adj.r.squared,
summary(lm_mod5)$adj.r.squared,
summary(lm_mod6)$adj.r.squared,
summary(lm_mod7)$adj.r.squared,
NA,
summary(lm_mod9)$adj.r.squared,
summary(lm_mod10)$adj.r.squared,
summary(lm_valid2)$adj.r.squared), 3)
dfresult <- data.frame(f1, f2, f3, f4, f5, f6, f7, f8)
colnames(dfresult) <- hdr
dfresult
## # Train/Validation Linear/Robust Full/Step-reduced Log Transform
## 1 1 Train Linear Full
## 2 2 Train Linear Step
## 3 3 Train Linear Step
## 4 4 Train Linear Full Yes
## 5 5 Train Linear Step Yes
## 6 6 Train Linear Step Yes
## 7 7 Train Linear Step Yes
## 8 8 Train Robust Step Yes
## 9 9 Train Linear Step Yes
## 10 10 Train Linear Step Yes
## 11 11 Validation Linear Step Yes
## Outliers Removed Huber-Weighted Adj R-Sqr
## 1 0.675
## 2 0.677
## 3 0.670
## 4 0.691
## 5 0.691
## 6 Yes 0.684
## 7 Yes 0.684
## 8 NA
## 9 Yes 0.710
## 10 Yes Yes 0.710
## 11 Yes 0.665